import pandas as pd
import numpy as np
# Import graph Libraries
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
# Text Processing Libraries
import string
import re
import nltk
from wordcloud import WordCloud,STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
#import ML packages
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
|=> Data Loading
- Read Data
- Data Insights
|=> Data Visualisation
- Univariate Analysis
- Bi-Variate Analysis
|=> Model Comparison
- Linear Regression
- GradientBoostingRegressor
- Decision Tree Regression
- Random Forest Regression
- XGB Regression
|=> Metrics
# Loading Data.
# low_memory=False makes pandas read the file in one pass so the
# mixed-type column (column 4, per the DtypeWarning in the original
# run) is inferred consistently instead of chunk by chunk.
data = pd.read_csv("AB_US_2020.csv", low_memory=False)
C:\Users\soura\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (4) have mixed types.Specify dtype option on import or set low_memory=False.
# Preview the first five rows of the raw listings data
data.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | city | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38585 | Charming Victorian home - twin beds + breakfast | 165529 | Evelyne | NaN | 28804 | 35.65146 | -82.62792 | Private room | 60 | 1 | 138 | 16/02/20 | 1.14 | 1 | 0 | Asheville |
| 1 | 80905 | French Chic Loft | 427027 | Celeste | NaN | 28801 | 35.59779 | -82.55540 | Entire home/apt | 470 | 1 | 114 | 07/09/20 | 1.03 | 11 | 288 | Asheville |
| 2 | 108061 | Walk to stores/parks/downtown. Fenced yard/Pet... | 320564 | Lisa | NaN | 28801 | 35.60670 | -82.55563 | Entire home/apt | 75 | 30 | 89 | 30/11/19 | 0.81 | 2 | 298 | Asheville |
| 3 | 155305 | Cottage! BonPaul + Sharky's Hostel | 746673 | BonPaul | NaN | 28806 | 35.57864 | -82.59578 | Entire home/apt | 90 | 1 | 267 | 22/09/20 | 2.39 | 5 | 0 | Asheville |
| 4 | 160594 | Historic Grove Park | 769252 | Elizabeth | NaN | 28801 | 35.61442 | -82.54127 | Private room | 125 | 30 | 58 | 19/10/15 | 0.52 | 1 | 0 | Asheville |
# Dataset dimensions: (rows, columns)
data.shape
(226030, 17)
# Column dtypes, non-null counts and memory usage
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 226030 entries, 0 to 226029 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 226030 non-null int64 1 name 226002 non-null object 2 host_id 226030 non-null int64 3 host_name 225997 non-null object 4 neighbourhood_group 110185 non-null object 5 neighbourhood 226030 non-null object 6 latitude 226030 non-null float64 7 longitude 226030 non-null float64 8 room_type 226030 non-null object 9 price 226030 non-null int64 10 minimum_nights 226030 non-null int64 11 number_of_reviews 226030 non-null int64 12 last_review 177428 non-null object 13 reviews_per_month 177428 non-null float64 14 calculated_host_listings_count 226030 non-null int64 15 availability_365 226030 non-null int64 16 city 226030 non-null object dtypes: float64(3), int64(7), object(7) memory usage: 29.3+ MB
# Summary statistics (count/mean/std/quantiles) for the numeric columns
data.describe()
| id | host_id | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.260300e+05 | 2.260300e+05 | 226030.000000 | 226030.000000 | 226030.000000 | 2.260300e+05 | 226030.000000 | 177428.00000 | 226030.000000 | 226030.000000 |
| mean | 2.547176e+07 | 9.352385e+07 | 35.662829 | -103.220662 | 219.716529 | 4.525490e+02 | 34.506530 | 1.43145 | 16.698562 | 159.314856 |
| std | 1.317814e+07 | 9.827422e+07 | 6.849855 | 26.222091 | 570.353609 | 2.103376e+05 | 63.602914 | 1.68321 | 51.068966 | 140.179628 |
| min | 1.090000e+02 | 2.300000e+01 | 18.920990 | -159.714900 | 0.000000 | 1.000000e+00 | 0.000000 | 0.01000 | 1.000000 | 0.000000 |
| 25% | 1.515890e+07 | 1.399275e+07 | 32.761783 | -118.598115 | 75.000000 | 1.000000e+00 | 1.000000 | 0.23000 | 1.000000 | 0.000000 |
| 50% | 2.590916e+07 | 5.138266e+07 | 37.261125 | -97.817200 | 121.000000 | 2.000000e+00 | 8.000000 | 0.81000 | 2.000000 | 140.000000 |
| 75% | 3.772624e+07 | 1.497179e+08 | 40.724038 | -76.919322 | 201.000000 | 7.000000e+00 | 39.000000 | 2.06000 | 6.000000 | 311.000000 |
| max | 4.556085e+07 | 3.679176e+08 | 47.734620 | -70.995950 | 24999.000000 | 1.000000e+08 | 966.000000 | 44.06000 | 593.000000 | 365.000000 |
# Percentage of missing values per feature.
# isna().mean() gives the fraction of missing entries in each column;
# multiplying by 100 expresses it as a percentage.
missing_values = data.isna().mean() * 100
missing_values
id 0.000000 name 0.012388 host_id 0.000000 host_name 0.014600 neighbourhood_group 51.252046 neighbourhood 0.000000 latitude 0.000000 longitude 0.000000 room_type 0.000000 price 0.000000 minimum_nights 0.000000 number_of_reviews 0.000000 last_review 21.502455 reviews_per_month 21.502455 calculated_host_listings_count 0.000000 availability_365 0.000000 city 0.000000 dtype: float64
# Bar plot of the missing-value percentage for each feature,
# one color per column name
px.bar(missing_values,title="Missing Percentage",color = missing_values.index)
# Select only the numeric (int64/float64) columns.
# NOTE(review): "numrical_df" is a typo for "numerical_df", kept as-is
# because later cells reference this exact name.
numrical_df = data.select_dtypes(include=['int64','float64'])
# Preview the numeric subset
numrical_df.head()
| id | host_id | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38585 | 165529 | 35.65146 | -82.62792 | 60 | 1 | 138 | 1.14 | 1 | 0 |
| 1 | 80905 | 427027 | 35.59779 | -82.55540 | 470 | 1 | 114 | 1.03 | 11 | 288 |
| 2 | 108061 | 320564 | 35.60670 | -82.55563 | 75 | 30 | 89 | 0.81 | 2 | 298 |
| 3 | 155305 | 746673 | 35.57864 | -82.59578 | 90 | 1 | 267 | 2.39 | 5 | 0 |
| 4 | 160594 | 769252 | 35.61442 | -82.54127 | 125 | 30 | 58 | 0.52 | 1 | 0 |
# Dimensions of the numeric subset
numrical_df.shape
(226030, 10)
# Plot the distribution of every numerical feature as a box plot on a
# 4x2 grid, skipping the first two columns (id, host_id) which are
# identifiers, not measurements. A single loop replaces the eight
# copy-pasted add_trace calls; trace order and grid positions are
# identical (row-major: latitude, longitude, price, ...).
plot_columns = numrical_df.columns[2:]
fig = make_subplots(rows=4, cols=2, subplot_titles=plot_columns)
for position, column in enumerate(plot_columns):
    fig.add_trace(
        go.Box(x=numrical_df[column]),
        row=position // 2 + 1, col=position % 2 + 1)
fig.show()
# 3D scatter plot of latitude, longitude and price to reveal price clusters,
# colored by room type
px.scatter_3d(data,x ="latitude",y="longitude",z="price",color= data.room_type)
# How review frequency relates to the minimum length of stay
px.scatter(x=data.reviews_per_month,y = data.minimum_nights,color=data.room_type)
# Box plot of price, split by minimum nights
px.box(data,x=data.price,color=data.minimum_nights,title = "Price variance With Nights")
# How room type affects price
px.box(data,x=data.price,color=data.room_type,title="Price Variance with room_type")
# Plot every listing on an orthographic globe, one marker per property,
# with its price shown on hover.
listing_markers = go.Scattergeo(
    lat=data["latitude"],
    lon=data["longitude"],
    mode="markers",
    text=data.price,
    hoverinfo="text",
    showlegend=True,
    name='Price',
    marker=dict(color="crimson", size=4, opacity=0.8),
)
fig = go.Figure(data=[listing_markers])
# Rotate the globe toward the continental US and style land/ocean.
fig.update_geos(
    projection_type="orthographic",
    landcolor="green",
    oceancolor="MidnightBlue",
    showocean=True,
    lakecolor="LightBlue",
    projection_rotation_lon=-92,
    projection_rotation_lat=15,
)
# Select the object-typed (categorical/text) columns
categorical_df = data.select_dtypes(include=['object'])
# Preview the categorical subset
categorical_df.head()
| name | host_name | neighbourhood_group | neighbourhood | room_type | last_review | city | |
|---|---|---|---|---|---|---|---|
| 0 | Charming Victorian home - twin beds + breakfast | Evelyne | NaN | 28804 | Private room | 16/02/20 | Asheville |
| 1 | French Chic Loft | Celeste | NaN | 28801 | Entire home/apt | 07/09/20 | Asheville |
| 2 | Walk to stores/parks/downtown. Fenced yard/Pet... | Lisa | NaN | 28801 | Entire home/apt | 30/11/19 | Asheville |
| 3 | Cottage! BonPaul + Sharky's Hostel | BonPaul | NaN | 28806 | Entire home/apt | 22/09/20 | Asheville |
| 4 | Historic Grove Park | Elizabeth | NaN | 28801 | Private room | 19/10/15 | Asheville |
# Room type distribution.
# value_counts() sorts by frequency while unique() keeps order of first
# appearance, so coloring by unique() can attach the wrong label/color
# to each bar; take the colors from the counts' own index instead.
room_counts = data.room_type.value_counts()
px.bar(room_counts, color=room_counts.index, title="Room_Type")
# Listing counts per city, colors aligned with the bar order.
city_counts = data.city.value_counts()
px.bar(city_counts, color=city_counts.index, title="City Counts")
# Word cloud built from the listing names (lower-cased, NaNs dropped).
names = data.name.dropna().str.lower()
word_list = " ".join(names)
cloud = WordCloud(width=800, height=600, min_font_size=10,
                  stopwords=STOPWORDS).generate(word_list)
plt.figure(figsize=(10, 8))
plt.imshow(cloud)
plt.title('WordCloud', fontsize=18, fontweight='bold')
plt.axis('off')
plt.show()
# Map each city in the data to its US state postal code.
# Fixes two wrong codes in the original mapping: Los Angeles is in
# California (was 'SC') and New Orleans is in Louisiana (was 'MS').
# NOTE(review): keys must match the raw `city` values exactly, so the
# apparently misspelled 'San Clara Country' key is kept as-is — verify
# against the dataset before renaming.
states_dic = {'Asheville':'NC','Austin':'TX','Boston':'MA','Broward County':'FL','Cambridge':'MA','Chicago':'IL','Clark County':'NV','Columbus':'OH','Denver':'CO','Hawaii':'HI','Jersey City':'NJ',
              'Los Angeles':'CA','Nashville':'TN','New Orleans':'LA','New York City':'NY','Oakland':'CA','Pacific Grove':'CA','Portland':'OR','Rhode Island':'RI','Salem':'MA','San Clara Country':'CA',
              'Santa Cruz County':'CA','San Diego':'CA','San Francisco':'CA','San Mateo County':'CA','Seattle':'WA','Twin Cities MSA':'MN','Washington D.C.':'DC'}
# Series.map does the dict lookup directly (no lambda needed).
categorical_df['state'] = categorical_df['city'].map(states_dic)
<ipython-input-28-2df487dc1d9a>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Listing counts per state. Colors come from the value_counts index so
# they stay aligned with the bars (unique() orders by first appearance,
# not by frequency, which could mislabel the colors).
state_counts = categorical_df.state.value_counts()
px.bar(state_counts, color=state_counts.index, title="State Property Counts")
# Average listing price per state on a US choropleth.
# categorical_df came from select_dtypes (a derived frame), and adding a
# column to it raised SettingWithCopyWarning in the original run; take
# an explicit copy first so the assignment is unambiguous.
categorical_df = categorical_df.copy()
categorical_df["price"] = data.price
# Grouping data with state
state_ = categorical_df.groupby(by='state')
# Get mean price of each state
state_color = state_.price.mean()
px.choropleth(state_color.values, locations=state_color.index,
              color=state_color.values, locationmode="USA-states",
              scope="usa", title='Average Price At Each State',
              color_continuous_scale=px.colors.diverging.Portland)
<ipython-input-30-5504c8152c8a>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Popularity of a state: average review count per listing.
# Copy before assigning a new column, for the same SettingWithCopyWarning
# reason as the price choropleth above (warning shown in the original run).
categorical_df = categorical_df.copy()
categorical_df["number_of_reviews"] = data.number_of_reviews
state_ = categorical_df.groupby(by='state')
# Get mean reviews of each state's properties
review_color = state_.number_of_reviews.mean()
px.choropleth(review_color.values, locations=review_color.index,
              color=review_color.values, locationmode="USA-states",
              scope="usa", title='Average Reviews At Each State',
              color_continuous_scale=px.colors.diverging.Portland)
<ipython-input-31-2f4d41f496fe>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Reload the raw data for modeling; low_memory=False avoids the
# mixed-dtype DtypeWarning on column 4 seen in the original run.
data = pd.read_csv("AB_US_2020.csv", low_memory=False)
# Drop identifier and mostly-null columns (neighbourhood_group is ~51%
# missing), then drop the remaining rows with missing values.
data.drop(["id","host_id","host_name","neighbourhood_group","neighbourhood","last_review"],axis=1,inplace=True)
data.dropna(inplace=True)
numrical_df = data.select_dtypes(include=['int64','float64'])
categorical_df = data.select_dtypes(include=['object'])
numeric_features = numrical_df.columns
# Removing outliers: keep the 10th-90th percentile band of each skewed
# feature, as some properties have unexpectedly high/low values.
lower_bound = .10
upper_bound = .90
# NOTE(review): the original filtered `price` three times in a row;
# each pass re-trimmed on quantiles of the already-filtered data, which
# looked like accidental copy-paste. Each feature is now filtered once.
data = data[data['price'].between(data['price'].quantile(lower_bound), data['price'].quantile(upper_bound), inclusive=True)]
data = data[data['minimum_nights'].between(data['minimum_nights'].quantile(lower_bound), data['minimum_nights'].quantile(upper_bound), inclusive=True)]
data = data[data['calculated_host_listings_count'].between(data['calculated_host_listings_count'].quantile(lower_bound), data['calculated_host_listings_count'].quantile(upper_bound), inclusive=True)]
# Keep only listings that have at least one review.
data = data[data['number_of_reviews'] > 0]
# Features after removing outliers
fig, axes = plt.subplots(nrows=2, ncols=4)
aux = 0
fig.set_figheight(17)
fig.set_figwidth(25)
for row in axes:
for col in row:
data[numeric_features[aux]].plot(kind='kde',ax=col)
col.set_title(numeric_features[aux] +' Distribution',fontsize=16,fontweight='bold')
aux+=1
# Turn the free-text listing names into a single numeric feature:
# TF-IDF vectorize the text, then collapse each row's sparse vector to
# its mean value.
tfidf = TfidfVectorizer()
name_vector = tfidf.fit_transform(data.name)
data["name"] = np.asarray(name_vector.mean(axis=1)).ravel()
# Preview the data with `name` replaced by its TF-IDF summary value
data.head()
| name | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | city | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 0.000165 | 35.60670 | -82.55563 | Entire home/apt | 75 | 30 | 89 | 0.81 | 2 | 298 | Asheville |
| 3 | 0.000110 | 35.57864 | -82.59578 | Entire home/apt | 90 | 1 | 267 | 2.39 | 5 | 0 | Asheville |
| 4 | 0.000097 | 35.61442 | -82.54127 | Private room | 125 | 30 | 58 | 0.52 | 1 | 0 | Asheville |
| 5 | 0.000080 | 35.61856 | -82.55276 | Entire home/apt | 134 | 7 | 54 | 0.49 | 1 | 294 | Asheville |
| 8 | 0.000127 | 35.61929 | -82.48114 | Entire home/apt | 71 | 28 | 537 | 5.01 | 1 | 207 | Asheville |
# Heat map of pairwise Pearson correlation coefficients between the
# numeric features
plt.figure(figsize=(16, 6))
sns.heatmap(data.corr(), annot=True)
plt.title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);
# One-hot encode the categorical features (room_type, city)
data_final = pd.get_dummies(data)
# Separate the target (price) from the features
Y = data_final.price
X = data_final.drop("price",axis=1)
# Split the data 70:30 into train and test sets. A fixed random_state
# makes the split — and therefore the model-comparison table —
# reproducible between runs (the original was unseeded).
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=42)
# Standardize features: fit the scaler on the training data only, then
# apply the same transform to the test data to avoid leakage.
scaler=StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
1. LinearRegression
2. GradientBoostingRegressor
3. DecisionTreeRegressor
4. RandomForestRegressor
5. XGB Regression
# Models compared; one results row per model
index = ["LinearRegression","GradientBoostingRegressor","DecisionTreeRegressor","RandomForestRegressor","XGBRegressor"]
# Results table: one column per error metric
results = pd.DataFrame(columns=["MSE","MAE","RMSE"],index=index)
# NOTE(review): these imports would conventionally live at the top of
# the file with the other sklearn imports
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import GridSearchCV
# Baseline model: ordinary least squares regression on all features.
lin_reg = LinearRegression(n_jobs=-1).fit(X_train, Y_train)
predictions = lin_reg.predict(X_test)
# Score the held-out test set on all three error metrics.
scores = {
    "MSE": mean_squared_error(Y_test, predictions),
    "MAE": mean_absolute_error(Y_test, predictions),
    "RMSE": mean_squared_error(Y_test, predictions, squared=False),
}
print("Mean Squared Error :: ", scores["MSE"])
print("Mean Absolute Error :: ", scores["MAE"])
print("Root Mean Squared Error :: ", scores["RMSE"])
results.loc["LinearRegression"] = [scores["MSE"], scores["MAE"], scores["RMSE"]]
Mean Squared Error :: 971.6391289086546 Mean Absolute Error :: 25.86436835895062 Root Mean Squared Error :: 31.171126526140416
# Grid-search GradientBoostingRegressor over learning rate and number
# of estimators, scored by (negated) MAE with 10-fold cross-validation.
gbr_param_grid = [{"learning_rate": [0.1, 0.01, 0.001],
                   "n_estimators": [50, 100, 150]}]
model_gbr = GridSearchCV(GradientBoostingRegressor(), gbr_param_grid,
                         cv=10, scoring="neg_mean_absolute_error")
model_gbr.fit(X_train, Y_train)
GridSearchCV(cv=10, estimator=GradientBoostingRegressor(),
param_grid=[{'learning_rate': [0.1, 0.01, 0.001],
'n_estimators': [50, 100, 150]}],
scoring='neg_mean_absolute_error')
# Refit GradientBoostingRegressor with the best grid-search parameters
# and evaluate it on the held-out test set.
gbr_reg = GradientBoostingRegressor(**model_gbr.best_params_).fit(X_train, Y_train)
predictions = gbr_reg.predict(X_test)
scores = {
    "MSE": mean_squared_error(Y_test, predictions),
    "MAE": mean_absolute_error(Y_test, predictions),
    "RMSE": mean_squared_error(Y_test, predictions, squared=False),
}
print("Mean Squared Error :: ", scores["MSE"])
print("Mean Absolute Error :: ", scores["MAE"])
print("Root Mean Squared Error :: ", scores["RMSE"])
results.loc["GradientBoostingRegressor"] = [scores["MSE"], scores["MAE"], scores["RMSE"]]
Mean Squared Error :: 917.6520978941544 Mean Absolute Error :: 25.089499953326364 Root Mean Squared Error :: 30.292773030776736
# Grid-search a decision tree over depth and leaf-count limits, scored
# by (negated) MAE with 10-fold cross-validation.
tree_param_grid = [{"max_depth": list(range(1, 10)),
                    "max_leaf_nodes": list(range(2, 10, 2))}]
model_tree = GridSearchCV(DecisionTreeRegressor(), tree_param_grid,
                          cv=10, scoring="neg_mean_absolute_error")
model_tree.fit(X_train, Y_train)
GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'max_leaf_nodes': [2, 4, 6, 8]}],
scoring='neg_mean_absolute_error')
# Refit the decision tree with the best grid-search parameters and
# evaluate it on the held-out test set.
tree_reg = DecisionTreeRegressor(**model_tree.best_params_).fit(X_train, Y_train)
predictions = tree_reg.predict(X_test)
scores = {
    "MSE": mean_squared_error(Y_test, predictions),
    "MAE": mean_absolute_error(Y_test, predictions),
    "RMSE": mean_squared_error(Y_test, predictions, squared=False),
}
print("Mean Squared Error :: ", scores["MSE"])
print("Mean Absolute Error :: ", scores["MAE"])
print("Root Mean Squared Error :: ", scores["RMSE"])
results.loc["DecisionTreeRegressor"] = [scores["MSE"], scores["MAE"], scores["RMSE"]]
Mean Squared Error :: 997.8994684451395 Mean Absolute Error :: 26.295086281579383 Root Mean Squared Error :: 31.589546822408508
# Grid-search a random forest over size, depth and feature-subset
# width, scored by (negated) MAE with 10-fold cross-validation.
forest_param_grid = [{"n_estimators": [50, 100, 200, 250],
                      "max_depth": [5, 10, 25, 50],
                      "max_features": [1, 2, 4, 8, 10]}]
model_forest = GridSearchCV(RandomForestRegressor(n_jobs=-1), forest_param_grid,
                            cv=10, scoring="neg_mean_absolute_error")
model_forest.fit(X_train, Y_train)
GridSearchCV(cv=10, estimator=RandomForestRegressor(n_jobs=-1),
param_grid=[{'max_depth': [5, 10, 25, 50],
'max_features': [1, 2, 4, 8, 10],
'n_estimators': [50, 100, 200, 250]}],
scoring='neg_mean_absolute_error')
# Refit the random forest with the best grid-search parameters and
# evaluate it on the held-out test set.
forest_reg = RandomForestRegressor(**model_forest.best_params_).fit(X_train, Y_train)
predictions = forest_reg.predict(X_test)
scores = {
    "MSE": mean_squared_error(Y_test, predictions),
    "MAE": mean_absolute_error(Y_test, predictions),
    "RMSE": mean_squared_error(Y_test, predictions, squared=False),
}
print("Mean Squared Error :: ", scores["MSE"])
print("Mean Absolute Error :: ", scores["MAE"])
print("Root Mean Squared Error :: ", scores["RMSE"])
results.loc["RandomForestRegressor"] = [scores["MSE"], scores["MAE"], scores["RMSE"]]
Mean Squared Error :: 863.5057392990154 Mean Absolute Error :: 24.009467182462814 Root Mean Squared Error :: 29.385468165387724
# Grid-search XGBoost over size, depth and learning rate, scored by
# (negated) MAE with 10-fold cross-validation.
xgb_param_grid = [{"n_estimators": [10, 50, 100],
                   "max_depth": [5, 10, 50],
                   "learning_rate": [0.1, 0.01, 0.001]}]
model_xgb = GridSearchCV(XGBRegressor(n_jobs=-1), xgb_param_grid,
                         cv=10, scoring="neg_mean_absolute_error")
model_xgb.fit(X_train, Y_train)
GridSearchCV(cv=10,
estimator=XGBRegressor(base_score=None, booster=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
enable_categorical=False, gamma=None,
gpu_id=None, importance_type=None,
interaction_constraints=None,
learning_rate=None, max_delta_step=None,
max_depth=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, scale_pos_weight=None,
subsample=None, tree_method=None,
validate_parameters=None, verbosity=None),
param_grid=[{'learning_rate': [0.1, 0.01, 0.001],
'max_depth': [5, 10, 50],
'n_estimators': [10, 50, 100]}],
scoring='neg_mean_absolute_error')
# Refit XGBoost with the best grid-search parameters and evaluate it
# on the held-out test set.
xgb_reg = XGBRegressor(**model_xgb.best_params_).fit(X_train, Y_train)
predictions = xgb_reg.predict(X_test)
scores = {
    "MSE": mean_squared_error(Y_test, predictions),
    "MAE": mean_absolute_error(Y_test, predictions),
    "RMSE": mean_squared_error(Y_test, predictions, squared=False),
}
print("Mean Squared Error :: ", scores["MSE"])
print("Mean Absolute Error :: ", scores["MAE"])
print("Root Mean Squared Error :: ", scores["RMSE"])
results.loc["XGBRegressor"] = [scores["MSE"], scores["MAE"], scores["RMSE"]]
Mean Squared Error :: 868.9667286343574 Mean Absolute Error :: 24.038391504117353 Root Mean Squared Error :: 29.478241613677664
# Final comparison table: one row per model, MSE/MAE/RMSE columns
results
| MSE | MAE | RMSE | |
|---|---|---|---|
| LinearRegression | 971.639129 | 25.864368 | 31.171127 |
| GradientBoostingRegressor | 917.652098 | 25.0895 | 30.292773 |
| DecisionTreeRegressor | 997.899468 | 26.295086 | 31.589547 |
| RandomForestRegressor | 863.505739 | 24.009467 | 29.385468 |
| XGBRegressor | 868.966729 | 24.038392 | 29.478242 |
# Bar + line overlay of each model's Mean Squared Error.
# go.Line is deprecated (it raised a DeprecationWarning in the original
# run); go.Scatter with mode="lines" is the supported equivalent.
fig = go.Figure()
fig.add_trace(go.Bar(y=results.MSE.values, x=results.MSE.index,
                     marker=dict(color=results.MSE.values)))
fig.add_trace(go.Scatter(y=results.MSE.values, x=results.MSE.index,
                         mode="lines"))
fig.update_layout(title=go.layout.Title(text="Mean Squared Error",
                                        font=go.layout.title.Font(size=25)))
fig.show()
C:\Users\soura\anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:378: DeprecationWarning: plotly.graph_objs.Line is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.scatter.Line - plotly.graph_objs.layout.shape.Line - etc.
# Bar + line overlay of each model's Mean Absolute Error.
# go.Line is deprecated; go.Scatter with mode="lines" is the supported
# equivalent trace type.
fig = go.Figure()
fig.add_trace(go.Bar(y=results.MAE.values, x=results.MAE.index,
                     marker=dict(color=results.MAE.values)))
fig.add_trace(go.Scatter(y=results.MAE.values, x=results.MAE.index,
                         mode="lines"))
fig.update_layout(title=go.layout.Title(text="Mean Absolute Error",
                                        font=go.layout.title.Font(size=25)))
fig.show()
# Bar + line overlay of each model's Root Mean Squared Error.
# go.Line is deprecated; go.Scatter with mode="lines" is the supported
# equivalent trace type.
fig = go.Figure()
fig.add_trace(go.Bar(y=results.RMSE.values, x=results.RMSE.index,
                     marker=dict(color=results.RMSE.values)))
fig.add_trace(go.Scatter(y=results.RMSE.values, x=results.RMSE.index,
                         mode="lines"))
fig.update_layout(title=go.layout.Title(text="Root Mean Squared Error",
                                        font=go.layout.title.Font(size=25)))
fig.show()